package org.luosl.webmagicx.conf

import java.io.File

import org.luosl.webmagicx.Utils.Logging

import scala.collection.immutable.Seq
import scala.xml._

/**
  * Created by luosl on 2017/11/7.
  *
  * Loads spider configuration XML files and converts each of them into a [[SpiderConf]].
  *
  * @param confPath path of a single configuration file, or of a directory that is scanned
  *                 for files whose name ends with "spider.xml"
  */
class ConfLoader(val confPath:String) extends Logging{

  /**
    * Parses every configuration file found under `confPath`.
    *
    * @return one [[SpiderConf]] per configuration file
    */
  def listAllConf():Array[SpiderConf] = listConfFile().map(parseConfFile)

  /**
    * Lists the spider configuration files designated by `confPath`.
    *
    * If `confPath` is a directory, its direct children whose name ends with "spider.xml" are
    * returned; otherwise `confPath` itself is returned as the only entry (without checking
    * that it exists).
    *
    * @throws RuntimeException if `confPath` is null or the directory cannot be listed
    */
  private def listConfFile(): Array[File] ={
    if(null==confPath) throw new RuntimeException(s"无效的配置类路径:$confPath")
    val confFile:File = new File(confPath)
    if(confFile.isDirectory){
      // File.listFiles() returns null (not an empty array) when the directory cannot be read,
      // so guard it instead of failing later with a bare NullPointerException.
      val children:Array[File] = Option(confFile.listFiles())
        .getOrElse(throw new RuntimeException(s"无法读取配置目录:${confFile.getAbsolutePath}"))
      children.filter(_.getName.endsWith("spider.xml"))
    }else{
      Array(confFile)
    }
  }


  /**
    * Parses one configuration file.
    *
    * @param confFile the XML configuration file
    * @return the parsed configuration
    * @throws RuntimeException if a component node is unknown, a prop lacks a key or a value,
    *                          or a `task` node has no `corn` attribute
    */
  def parseConfFile(confFile:File): SpiderConf ={

    /**
      * Parses a single component node (processor / pipeline / scheduler / handler).
      *
      * @param node the component node
      * @return the component's class name together with its properties
      */
    def parseCompNode(node: Node): Component ={
      // The default class depends on the component kind; handlers have no default, so a
      // handler without a <class> child ends up with a null class name.
      val defClazz:String = node.label match {
        case "processor" => DefaultConfValue.processorClass
        case "pipeline" => DefaultConfValue.pipelineClass
        case "scheduler" => DefaultConfValue.schedulerClass
        case "handler" => null
        case _ => throw new RuntimeException(s"无效的组件:${node.label},请检查配置文件:${confFile.getAbsolutePath}")
      }
      val clazz:String = lastStrValue(node \ "class", defClazz)
      val props:Map[String, String] = (node \\ "props").flatMap{ propsNode=>
        // Only element children are properties; whitespace text, comments, CDATA and
        // processing instructions are skipped.
        propsNode.child.collect{ case propElem:Elem =>
          // <prop key=".."> / <prop><key>..</key></prop> carry the key explicitly,
          // any other element uses its own label as the key.
          val keyOpt:Option[String] = if(propElem.label == "prop") {
            findFirstNoEmptyNodeSeq(propElem \ "@key", propElem \ "key").map(_.text)
          }else{
            Option(propElem.label)
          }
          // Prefer the "value" attribute, fall back to the element's text content.
          val valueOpt:Option[String] = findFirstNoEmptyNodeSeq(propElem \ "@value", propElem).map(_.text)
          val entryOpt:Option[(String, String)] = for(key <- keyOpt; value <- valueOpt) yield (key, value)
          entryOpt.getOrElse(
            throw new RuntimeException(s"prop 属性缺少 key 或者value 请检查配置文件:${confFile.getAbsolutePath}")
          )
        }
      }.toMap
      Component(clazz, props)
    }

    /**
      * Parses the last node of the sequence as a component, if there is one.
      *
      * @param nodeSeq candidate component nodes
      * @return the parsed component, or None when the sequence is empty
      */
    def lastCompOpt(nodeSeq: NodeSeq):Option[Component] = nodeSeq.lastOption.map(parseCompNode)

    // Root element of the document
    val root:Elem = XML.loadFile(confFile)
    // Whether this spider is enabled
    val enable = asBoolean(root \ "@enable",DefaultConfValue.enable)
    // Free-text description
    val desc:String = lastStrValue(root \ "desc", "")
    // Start URLs
    val startUrls:Seq[String] = (root \ "startUrls" \\ "url").map(_.text)
    // Target URL regexes, each with an optional xpath attribute
    val targetUrlRegexs:Seq[TargetUrlRegex] = (root \ "targetUrlRegexs" \\ "regex").map{ node=>
      val xpathOpt:Option[String] = (node \ "@xpath").headOption.map(_.text)
      TargetUrlRegex(xpathOpt, node.text)
    }
    // Optional task definition; the "corn" attribute is mandatory when the node is present
    val task:Option[Task] = (root \ "task").lastOption.map{ node=>
      Task(
        asBoolean(node \ "@startNow"),
        (node \ "@corn").lastOption.map(_.text).getOrElse(throw new RuntimeException("task节点corn为必要属性"))
      )
    }
    // Crawl attributes
    val attrNode:NodeSeq = root \ "attribute"
    val maxDeep = asInt(attrNode \ "maxDeep", DefaultConfValue.maxDeep)
    val charset:String = (attrNode \ "charset").headOption.map(_.text).getOrElse(DefaultConfValue.charset)
    val timeout:Int = asInt(attrNode \ "timeout", DefaultConfValue.timeout)
    // NOTE(review): threadNum and retryTimes fall back to DefaultConfValue.timeout, which looks
    // like a copy/paste slip. DefaultConfValue is not visible from this file, so the fallback is
    // left untouched - confirm whether dedicated defaults exist and switch to them.
    val threadNum:Int = asInt(attrNode \ "threadNum", DefaultConfValue.timeout)
    val retryTimes:Int = asInt(attrNode \ "retryTimes", DefaultConfValue.timeout)
    val attr:Attr = Attr(maxDeep, charset, timeout, threadNum, retryTimes)
    // Components
    val compsNode:NodeSeq = root \ "components"
    val processorComp:Option[Component] = lastCompOpt(compsNode \ "processor")
    val handlers:Seq[Component] = (compsNode \ "handler").map(parseCompNode)
    // When no pipeline is configured, a single default pipeline without properties is used
    val pipelines:Seq[Component] = {
      val configured:Seq[Component] = (compsNode \ "pipeline").map(parseCompNode)
      if(configured.isEmpty) {
        Seq(Component(DefaultConfValue.pipelineClass,Map.empty[String,String]))
      }else{
        configured
      }
    }
    val schedulerComp:Option[Component] = lastCompOpt(compsNode \ "scheduler")
    val components:Components = Components(processorComp,handlers,pipelines,schedulerComp)
    // Site settings
    val siteNode:NodeSeq = root \ "site"
    val userAgent:String = lastStrValue(siteNode \ "userAgent","")
    val headers:Map[String, String] = keyValueAttrs(siteNode \ "headers" \ "header")
    val cookies:Map[String, String] = keyValueAttrs(siteNode \ "cookies" \ "cookie")
    val site = Site(userAgent,headers,cookies)
    // Extraction fields
    val fields:Seq[Field] = (root \ "fields" \ "field").map{ fieldNode=>
      val name = (fieldNode \ "name").text
      val xpathsNode:NodeSeq = fieldNode \ "xpaths"
      // Without an <xpaths> group, the field's single <xpath> child is used with the default scope
      val scope:String = if(xpathsNode.isEmpty){
        DefaultConfValue.xpathsScope
      }else{
        lastStrValue(xpathsNode \ "@scope", DefaultConfValue.xpathsScope)
      }
      val xpaths:Seq[String] = if(xpathsNode.isEmpty){
        Seq((fieldNode \ "xpath").text)
      }else{
        (xpathsNode \ "xpath").map(_.text)
      }
      val textFormat:Boolean = asBoolean(fieldNode \ "textFormat", DefaultConfValue.textFormat)
      val must:Boolean = asBoolean(fieldNode \ "must", DefaultConfValue.must)
      Field(name,xpaths,scope,textFormat,must)
    }
    SpiderConf(desc, startUrls,targetUrlRegexs,task,components,fields,site,attr,confFile.getAbsolutePath,enable)
  }

  /**
    * Builds a map from the "key" / "value" attributes of every node in the sequence.
    * A missing attribute yields an empty string; later duplicates of a key win.
    *
    * @param nodes nodes carrying `key` and `value` attributes
    * @return key -> value
    */
  private def keyValueAttrs(nodes: NodeSeq): Map[String, String] =
    nodes.map(node=> (node \@ "key", node \@ "value")).toMap

  /**
    * Finds the first non-empty node sequence among the candidates.
    *
    * @param nodeSeq candidates, in priority order
    * @return the first non-empty candidate, or None when all of them are empty
    */
  private def findFirstNoEmptyNodeSeq(nodeSeq: NodeSeq*): Option[NodeSeq] = nodeSeq.find(_.nonEmpty)

  /**
    * Returns the text of the last node of the sequence.
    *
    * @param node   the node sequence
    * @param defVal value returned when the sequence is empty
    * @return the last node's text, or `defVal`
    */
  private def lastStrValue(node:NodeSeq, defVal:String): String ={
    node.lastOption.map(_.text).getOrElse(defVal)
  }

  /**
    * Reads the text of `node` as an Int.
    *
    * Surrounding whitespace is ignored. A missing or blank value silently yields `defVal`;
    * a value that is present but not a valid integer is logged as a warning and also yields
    * `defVal`.
    *
    * @param node   the node sequence holding the value
    * @param defVal fallback value
    * @return the parsed value, or `defVal`
    */
  private def asInt(node:NodeSeq, defVal:Int = 1): Int ={
    val raw:String = node.text.trim
    if(raw.isEmpty){
      defVal
    }else{
      try{
        raw.toInt
      }catch {
        case e:NumberFormatException =>
          logWarning(s"$node 无法转换为Int类型，将给出默认值[$defVal]", e)
          defVal
      }
    }
  }

  /**
    * Reads the text of `node` as a Boolean.
    *
    * Surrounding whitespace is ignored. A missing or blank value silently yields `defVal`;
    * a value that is present but is neither "true" nor "false" is logged as a warning and
    * also yields `defVal`.
    *
    * @param node   the node sequence holding the value
    * @param defVal fallback value
    * @return the parsed value, or `defVal`
    */
  private def asBoolean(node:NodeSeq,defVal:Boolean = true): Boolean ={
    val raw:String = node.text.trim
    if(raw.isEmpty){
      defVal
    }else{
      try{
        raw.toBoolean
      }catch {
        case e:IllegalArgumentException =>
          logWarning(s"$node 无法转换为Boolean类型，将给出默认值[$defVal]", e)
          defVal
      }
    }
  }

}

/**
  * Companion of [[ConfLoader]]: factory methods plus a small command-line entry point.
  *
  * The entry point is an explicit `main` rather than the `App` trait, so that merely calling
  * one of the `apply` factories can never trigger the loading code as an initialisation
  * side effect.
  */
object ConfLoader {

  /** Creates a loader for the given configuration file or directory. */
  def apply(confPath:String):ConfLoader = new ConfLoader(confPath)

  /** Creates a loader for the default "spiderConf" location. */
  def apply():ConfLoader = apply("spiderConf")

  /**
    * Parses every configuration found at the default location; the result is discarded.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    ConfLoader().listAllConf()
    ()
  }
}
